library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
library(networkD3)
library(visNetwork)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(sjPlot)
## Registered S3 method overwritten by 'parameters':
## method from
## format.parameters_distribution datawizard
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(DT)
##
## Attaching package: 'DT'
## The following object is masked from 'package:networkD3':
##
## JS
library(leaflet)
##
## Attaching package: 'leaflet'
##
## The following object is masked from 'package:networkD3':
##
## JS
library(readr)
library(ggthemes)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(stringr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RColorBrewer)
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
register_google(key = "AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg", write = TRUE)
## Replacing old key (AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg) with new key in C:/Users/DELL/Documents/.Renviron
## 8. Append: Network of bars
rb <- barreview %>%
group_by(business_id) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head(50)
r50 <- barreview %>%
filter(business_id %in% rb$business_id) %>%
select(user_id, business_id)
## One bar have a branch in very close location also listed in top50, so actually the number of bars is 49
r50 <- left_join(r50, bar, by = "business_id")
r50 <- r50[, c("user_id", "name", "neighborhood", "review_count")]
r50$name<-gsub('["]',"", r50$name)
r50$name <- ifelse(r50$name == "Bachi Burger" & r50$neighborhood == "Southeast", "Bachi Burger(SE)", r50$name)
r50 <- mutate(r50, barnum = as.factor(name))
barlink <- NULL
for (i in 1 : 48) {
for (j in i : 48) {
a <- r50 %>% filter(as.numeric(barnum) == i)
a <- a$user_id
b <- r50 %>% filter(as.numeric(barnum) == j)
b <- b$user_id
c <- intersect(a, b)
d <- length(c)
tmp <- c(i, j, d)
barlink <- rbind(barlink, tmp)
}
}
barlink <- as.data.frame(barlink)
barlink <- barlink %>%
filter(V1 != V2) %>%
filter(V3 > 0)
barlink <- rename(barlink, source = V1, target = V2, value = V3)
barlink2 <- barlink %>% filter(value >= 50)
barlink2$value <- barlink2$value / 50
barlink2$source = barlink2$source - 1
barlink2$target = barlink2$target - 1
barnode <- r50[, c("barnum", "neighborhood", "review_count")] %>%
distinct()
barnode$neighborhood <- as.factor(barnode$neighborhood)
barnode$rcsize <- barnode$review_count / 100 - 10
p8 <- forceNetwork(Links = barlink2,
Nodes = barnode,
Source = "source",
Target = "target",
Value = "value",
NodeID = "barnum",
Nodesize = "rcsize",
Group = "neighborhood",
opacity = 0.6, zoom = TRUE)
p8
## Descriptive graph
rc <- barreview %>%
group_by(business_id) %>%
summarize(avguseful = mean(useful), count = n())
bar <- inner_join(bar, rc, by = "business_id")
rmost <- bar %>%
arrange(desc(count)) %>%
head(10)
rmost[8, 4] <- "Downtown"
rmost <- rmost[, c("count", "name", "neighborhood", "categories")]
rmost <- rmost[order(rmost[ ,"count"], decreasing = TRUE), ]
reviewgraph <- ggplot(rmost, aes(x = count, y = reorder(name, count), fill = neighborhood, text = paste("categories:", categories))) +
geom_bar(stat="identity", width=1, color="white") +
labs(x="review count", y="name of bar")
## 2. Which bars are reviewed most?
p2 <- ggplotly(reviewgraph)
p2
neighborhood <- bar %>%
group_by(neighborhood) %>%
summarize(avgstar = mean(stars), count = n())
neighborhood[1, 1] <- "Not known"
neighborhoodgraph <- ggplot(neighborhood, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) +
geom_bar(stat="identity", width=1, color="white") +
labs(x="review count", y="name of neighborhood")
## 3. Which neighborhood has most bars?
p3 <- ggplotly(neighborhoodgraph)
p3
topstar <- mutate(bar, topstar = ifelse(stars >= 4 & review_count >= 100, "Top bars", "Non top bars"))
toppct <- topstar %>%
group_by(topstar) %>%
summarize(count = n(), avgreview = mean(review_count))
## 4. The gap between topbars and non top bars
p4 <- tab_df(toppct)
p4
| topstar | count | avgreview |
|---|---|---|
| Non top bars | 1139 | 106.49 |
| Top bars | 240 | 529.58 |
topstar <- topstar %>%
filter(topstar == "Top bars") %>%
group_by(neighborhood) %>%
summarize(count = n(), avgstar = mean(stars))
topstar[1, 1] <- "Not known"
topbargraph <- ggplot(topstar, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) +
geom_bar(stat="identity", width=1, color="white") +
labs(x="review count", y="name of neighborhood")
## 5. Which neighborhood has most top bars?
p5 <- ggplotly(topbargraph)
p5
## 6. What influences the elite users' review scores most who has reviewed in Las Vegas.
lm1 <- lm(average_stars ~ review_count + fans + useful + compliment_hot, elite)
summary(lm1)
##
## Call:
## lm(formula = average_stars ~ review_count + fans + useful + compliment_hot,
## data = elite)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87542 -0.16202 0.01053 0.17069 0.75058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.896e+00 1.803e-02 216.130 < 2e-16 ***
## review_count -8.430e-05 1.598e-05 -5.274 1.77e-07 ***
## fans 1.525e-04 4.488e-05 3.397 0.000719 ***
## useful 1.548e-07 6.045e-07 0.256 0.798000
## compliment_hot -2.032e-06 5.988e-06 -0.339 0.734405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2631 on 718 degrees of freedom
## Multiple R-squared: 0.04107, Adjusted R-squared: 0.03573
## F-statistic: 7.689 on 4 and 718 DF, p-value: 4.549e-06
userstars <- ggplot(data = elite, aes(x = fans, y = average_stars)) +
stat_smooth(method = "lm", col = "blue") +
xlab("Number of fans") + ylab("Average stars")
p6 <- ggplotly(userstars)
## `geom_smooth()` using formula 'y ~ x'
p6
# 7. Does the amount of review have a positive correlation with the bars' reputation in Las Vegas.
lm2 <- lm(stars ~ review_count, bar, na.action = na.omit)
summary(lm2)
##
## Call:
## lm(formula = stars ~ review_count, data = bar, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6128 -0.6129 -0.1138 0.3844 1.3874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.612e+00 2.018e-02 179.021 < 2e-16 ***
## review_count 1.361e-04 5.077e-05 2.681 0.00743 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6679 on 1377 degrees of freedom
## Multiple R-squared: 0.005193, Adjusted R-squared: 0.004471
## F-statistic: 7.188 on 1 and 1377 DF, p-value: 0.007425
barstars <- ggplot(data = bar, aes(x = review_count, y = stars)) +
stat_smooth(method = "lm", col = "blue") +
xlab("Number of reviews") + ylab("Average stars")
p7 <- ggplotly(barstars)
## `geom_smooth()` using formula 'y ~ x'
p7
map_lv <- get_map("Las Vegas",
zoom = 12,
source = "stamen",
maptype = "toner-background")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Las%20Vegas&zoom=12&size=640x640&scale=2&maptype=terrain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Las+Vegas&key=xxx
## Source : http://tile.stamen.com/toner-background/12/736/1604.png
## Source : http://tile.stamen.com/toner-background/12/737/1604.png
## Source : http://tile.stamen.com/toner-background/12/738/1604.png
## Source : http://tile.stamen.com/toner-background/12/739/1604.png
## Source : http://tile.stamen.com/toner-background/12/736/1605.png
## Source : http://tile.stamen.com/toner-background/12/737/1605.png
## Source : http://tile.stamen.com/toner-background/12/738/1605.png
## Source : http://tile.stamen.com/toner-background/12/739/1605.png
## Source : http://tile.stamen.com/toner-background/12/736/1606.png
## Source : http://tile.stamen.com/toner-background/12/737/1606.png
## Source : http://tile.stamen.com/toner-background/12/738/1606.png
## Source : http://tile.stamen.com/toner-background/12/739/1606.png
## Source : http://tile.stamen.com/toner-background/12/736/1607.png
## Source : http://tile.stamen.com/toner-background/12/737/1607.png
## Source : http://tile.stamen.com/toner-background/12/738/1607.png
## Source : http://tile.stamen.com/toner-background/12/739/1607.png
ggmap(map_lv)
g_location <- ggmap(map_lv) + theme_map()
g_location + geom_point(data=bar, aes(x=longitude,y=latitude),
size=0.3, alpha=0.3, color="blue")
## Warning: Removed 347 rows containing missing values (geom_point).
g_density <- g_location + geom_density2d(aes(x=longitude,y=latitude),
data=bar, color="green", size=1, bins=12) +
stat_density2d(aes(x=longitude,y=latitude,
fill = ..level.., alpha = ..level..),
data=bar, geom = 'polygon', bins=12) +
scale_fill_gradient2(low = "green", mid="yellow", high = "red") +
scale_alpha(range = c(0.00, 0.5))
g_density
## Warning: Removed 347 rows containing non-finite values (stat_density2d).
## Removed 347 rows containing non-finite values (stat_density2d).
# Visualize the neighborhood each bar belongs to
#add legend of stars
lvbar_map_neighborhood <-
leaflet(bar) %>%
addTiles() %>% # Add OpenStreetMap map tiles
addCircles(lng = ~longitude, lat = ~latitude)
pal = colorFactor("Set1", domain = bar$neighborhood) # Grab a palette
color_neighborhood = pal(bar$neighborhood)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
lvbar_map_neighborhood %>% addCircles(color=color_neighborhood) %>%
addLegend(pal = pal, values = ~bar$neighborhood, title = "Neighborhood")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
content <- paste("Name:",bar$name,"<br/>",
"Address:",bar$address,"<br/>",
"Stars:",bar$stars,"<br/>",
"Neighborhood:", bar$neighborhood,"<br/>")
pal = colorFactor("YlOrRd", domain = bar$stars) # Grab a palette
color_stars = pal(bar$stars)
lvbar_map_neighborhood %>% addCircles(color=color_stars, popup = content) %>%
addLegend(pal = pal, values = ~bar$stars, title = "Stars")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
Most Popular Categories regarding bars:
fillColor = "#FFA07A"
fillColor2 = "#F1C40F"
categories = str_split(bar$categories,";")
categories = as.data.frame(unlist(categories))
colnames(categories) = c("Name")
categories %>%
group_by(Name) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
ungroup() %>%
mutate(Name = reorder(Name,Count)) %>%
head(10) %>%
ggplot(aes(x = Name,y = Count)) +
geom_bar(stat='identity',colour="white", fill =fillColor2) +
geom_text(aes(x = Name, y = 1, label = paste0("(",Count,")",sep="")),
hjust=0, vjust=.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Name of Category', y = 'Count',
title = 'Top 10 Categories regarding bars') +
coord_flip() +
theme_bw()
Bars with most number of five Star Reviews:
stars_5 <- barreview %>%
filter(stars ==5) %>%
group_by(business_id) %>%
select(business_id,stars,text) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
ungroup()
five = merge(stars_5, bar, by= "business_id")
five2 <- five %>%
filter(stars ==5) %>%
filter(is_open==1)
fivestar <- five2 %>%
arrange(stars) %>%
head(10)
fillColor2 = "#F1C40F"
fivestar %>%
mutate(name = reorder(name,Count)) %>%
ggplot(aes(x = name,y = Count)) +
geom_bar(stat='identity',colour="white", fill = fillColor2) +
geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
hjust=0, vjust=.5, size = 2, colour = 'black',
fontface = 'bold') +
labs(x = 'Name of the Bars',
y = 'Count',
title = 'Name of the bars and Count') +
coord_flip() +
theme_bw()
Most 5 starred bar - J Karaoke Bar:
J_karaoke = bar %>% filter(business_id == "3pSUr_cdrphurO6m1HMP9A") %>%
select(name,neighborhood,city,state,postal_code,categories)
datatable(head(J_karaoke), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
A wordcloud to see the common words of reviews on “J Karaoke Bar”
createWordCloud = function(train)
{
train %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
head(30) %>%
with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}
createWordCloud(barreview %>%
filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))
From the wordcloud, we can derive the ingisht that people praise the atmosphere, music, cleaning environment, services and food(especially chicken) in this bar, and indicates that they spend happy and comfortable time in this J Karaoke bar.
Similarly, let’s visualize the bars with most number of one star reviews:
#library()
stars_1 <- barreview %>%
filter(stars ==1|stars==1.5) %>%
group_by(business_id) %>%
select(business_id,stars,text) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
ungroup()
one = merge(stars_1, bar, by= "business_id")
one2 <- one %>%
filter(stars ==1|stars==1.5) %>%
filter(is_open==1)
onestar <- one2 %>%
arrange(stars) %>%
head(10)
fillColor2 = "#F1C40F"
onestar %>%
mutate(name = reorder(name,Count)) %>%
ggplot(aes(x = name,y = Count)) +
geom_bar(stat='identity',colour="white", fill = fillColor2) +
geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
hjust=0, vjust=.5, size = 2, colour = 'black',
fontface = 'bold') +
labs(x = 'Name of the Bars',
y = 'Count',
title = 'Name of the bars and Count') +
coord_flip() +
theme_bw()
Surprisingly, the bar named “Triumph property management” only has one star rating, and there are 12 reviews on that bar.
So we are interested to see the common words of reviews on “Triumph property management”:
createWordCloud = function(train)
{
train %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
head(30) %>%
with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}
createWordCloud(barreview %>%
filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))
People in their reviews complain about the house/environment of the bar.
Let’s create a datatable to see some information regarding “Triumph property management”:
Triumph = bar %>% filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ") %>%
select(name,neighborhood,city,state,postal_code,categories)
datatable(head(Triumph), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
define 5-star and 1-star bar datasets with reviews
goodbar <- barreview %>%
filter(stars == 5) %>%
group_by(business_id) %>%
ungroup()
star_five <- merge(goodbar,bar,by = "business_id")
badbar <- barreview %>%
filter(stars == 1) %>%
group_by(business_id) %>%
ungroup()
star_one <- merge(badbar,bar,by = "business_id")
preprocessing reviews
df_five = data.frame(doc_id = star_five$business_id, text = star_five$text,stringsAsFactors = F)
star_five2 <- DataframeSource(df_five)
star_five2 <- VCorpus(star_five2)
df_one = data.frame(doc_id = star_one$business_id, text = star_one$text,stringsAsFactors = F)
star_one2 <- DataframeSource(df_one)
star_one2 <- VCorpus(star_one2)
#Remove unnecessary words(stop words), synatx, punctuation, numbers, white space etc.
remove_nonalphanum <- function(x){str_replace_all(x, "[^[:alnum:]]", " ")}
remove_brandnames <- function(x){str_replace_all(x, "\\b[A-Z]+\\b", " ")}
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(remove_nonalphanum))
corpus <- tm_map(corpus, content_transformer(remove_brandnames))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
return(corpus)
}
#cleaning two datasets
star_five_clean <- clean_corpus(star_five2)
star_one_clean <- clean_corpus(star_one2)
#create a document-term-matrix:
#create the dtm from the corpus
corpus_five_dtm <- DocumentTermMatrix(star_five_clean)
corpus_one_dtm <- DocumentTermMatrix(star_one_clean)
#provide a word cloud of the most frequent words for "five_star" bars and "one_star" bars
corpus_five_dt <- tidy(corpus_five_dtm)
corpus_one_dt <- tidy(corpus_one_dtm)
head(corpus_five_dt)
## # A tibble: 6 x 3
## document term count
## <chr> <chr> <dbl>
## 1 --q7kSBRb0vWC8lSkXFByA bit 1
## 2 --q7kSBRb0vWC8lSkXFByA blvd 1
## 3 --q7kSBRb0vWC8lSkXFByA drive 1
## 4 --q7kSBRb0vWC8lSkXFByA football 1
## 5 --q7kSBRb0vWC8lSkXFByA games 1
## 6 --q7kSBRb0vWC8lSkXFByA great 3
#tf-idf
corpus_five_tdidf <- corpus_five_dt %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
## Warning: A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
corpus_one_tdidf <- corpus_one_dt %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
## Warning: A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
head(corpus_five_tdidf)
## # A tibble: 6 x 6
## document term count tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 41n0hPBRTHmZLKXAhGx9vA townaaaa 1 0.25 7.19 1.80
## 2 JZG8d5KfxedhYEDJUXGAxw abdul 1 0.111 7.19 0.799
## 3 0JmCGyAqAK_yVEuZ0ts16w bilbos 1 0.1 7.19 0.719
## 4 s9RzET4jnh3pFhAYEqRgNA seriouslybartenders 1 0.0909 7.19 0.653
## 5 dd3lBUGkokm3P1ChWJnYuw freds 1 0.1 5.80 0.580
## 6 WM5lY9Yp9FFr7dsC-5sNYA jillians 2 0.0833 6.50 0.541
term_frequency_DT_five <- corpus_five_tdidf
term_frequency_DT_one <- corpus_one_tdidf
library(wordcloud)
#Set seed
set.seed(2103)
#create a wordcloud to show the frequent words of five stars bars
wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf,
max.words=50, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## hour could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## happy could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## sportsbook could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## guitar could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## best could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## townaaaa could not be fit on page. It will not be plotted.
#create a wordcloud to show the frequent words of one stars bars
wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf,
max.words=50, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## behind could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## march could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## plain could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## longer could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## open could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## service could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## sadden could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## either could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## wake could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## management could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, : cab
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## place could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## closed could not be fit on page. It will not be plotted.
A pyramid plot to show how the words between five-stars and one-stars bars differ in word frequency:
#combine corpus of the most successful and unsuccessful projects
#select top 20 words
corpus_five_dt$bestworst <- "Top"
corpus_one_dt$bestworst <- "Bottom"
corpus_top_bottom_dt <- rbind(corpus_five_dt,corpus_one_dt)
corpus_top_bottom_count <- corpus_top_bottom_dt %>%
group_by(term) %>%
summarize(total_word = sum(count)) %>%
arrange(desc(total_word)) %>%
head(20)
pyramid = left_join(corpus_top_bottom_dt, corpus_top_bottom_count, by='term')
pyramid <- pyramid %>%
filter(!is.na(total_word)) %>%
group_by(bestworst) %>%
mutate(count_plot = ifelse(bestworst == 'Bottom', count*(-1), count))
ggplot(pyramid, aes(x = reorder(term, total_word),
y = count_plot, fill = bestworst)) +
geom_bar(data = filter(pyramid, bestworst == "Top"), stat = "identity") +
geom_bar(data = filter(pyramid, bestworst == "Bottom"), stat = "identity") +
scale_fill_brewer(palette = "Set1", direction=-1) +
coord_flip() +
scale_y_continuous(breaks = seq(-50,50,25)) +
scale_fill_discrete(name = 'bars star rating', labels=c('one star', 'five star')) +
ylab("") +
ggthemes::theme_tufte() +
labs(
x = 'Top 20 Words',
y= 'Count',
title = 'Pyramid Plot of Top 20 Words, for one star bars and five star bars'
)
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
Sentiment analysis of reviews:
Positive v.s. negative words in the reivews of J Karaoke Bar
positiveWordsBarGraph <- function(SC) {
contributions <- SC %>%
unnest_tokens(word, text) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(word) %>%
summarize(occurences = n(),
contribution = sum(value))
contributions %>%
top_n(20, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
head(20) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() + theme_bw()
}
positiveWordsBarGraph(barreview %>%
filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))
calculate sentiment for “J Karaoke Bar”
J_Karaoke_reviews = star_five %>%
filter(business_id == "3pSUr_cdrphurO6m1HMP9A")
calculate_sentiment <- function(review)
{
sentiment_lines = review %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(user_id) %>%
summarize(sentiment = mean(value),words = n()) %>%
ungroup() %>%
filter(words >= 10)
return(sentiment_lines)
}
sentiment_lines = calculate_sentiment(J_Karaoke_reviews)
head(sentiment_lines)
## # A tibble: 6 x 3
## user_id sentiment words
## <chr> <dbl> <int>
## 1 2wxtnu-tA8i9HjHD55iU6g 1.91 11
## 2 3iocNPlPThAG2ZaNtUo4TQ 0.733 30
## 3 6vmfgZN3IC3toXRQdmKtBg 1.19 16
## 4 8dxkcmAXY4ttrVFD1GhbdQ 1.67 21
## 5 aF0BTeTVRXv4OHYXMNH7SQ 2.12 17
## 6 AVkp6nU-94QuRYZDxO6kpQ 1.54 13
Display top 10 most positive reviews for 5 star bars:
display_pos_sentiments <- function(sentiment_lines,review_text)
{
pos_sentiment_lines = sentiment_lines %>%
arrange(desc(sentiment)) %>%
top_n(10, sentiment) %>%
inner_join(review_text, by = "user_id") %>%
select(date,sentiment,text)
datatable(pos_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
}
display_pos_sentiments(sentiment_lines,J_Karaoke_reviews)
Positive v.s. negative words in the reivews of Triumph property management
positiveWordsBarGraph <- function(SC) {
contributions <- SC %>%
unnest_tokens(word, text) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(word) %>%
summarize(occurences = n(),
contribution = sum(value))
contributions %>%
top_n(20, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
head(20) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() + theme_bw()
}
positiveWordsBarGraph(barreview %>%
filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))
Triumph_reviews = barreview %>%
filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ")
calculate_sentiment <- function(review)
{
sentiment_lines = review %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(user_id) %>%
summarize(sentiment = mean(value),words = n()) %>%
ungroup() %>%
filter(words >= 10)
return(sentiment_lines)
}
sentiment_lines = calculate_sentiment(Triumph_reviews)
Display top 10 most negative reviews for Triumph property management:
display_neg_sentiments <- function(sentiment_lines,review_text)
{
neg_sentiment_lines = sentiment_lines %>%
arrange(desc(sentiment)) %>%
top_n(-10, sentiment) %>%
inner_join(review_text, by = "user_id") %>%
select(date,sentiment,text)
datatable(neg_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
}
display_neg_sentiments(sentiment_lines,Triumph_reviews)
Visualize the geographical location of the top 10 five star bars(blue dot) and bottom 1 star or 1.5 star bars(yellow dot):
library(devtools)
## Loading required package: usethis
devtools::install_github("rstudio/leaflet")
## Downloading GitHub repo rstudio/leaflet@HEAD
## cli (3.2.0 -> 3.3.0) [CRAN]
## vctrs (0.4.0 -> 0.4.1) [CRAN]
## RColorBrewer (1.1-2 -> 1.1-3) [CRAN]
## tibble (3.1.6 -> 3.1.7) [CRAN]
## scales (1.1.1 -> 1.2.0) [CRAN]
## ggplot2 (3.3.5 -> 3.3.6) [CRAN]
## sp (1.4-6 -> 1.4-7) [CRAN]
## Installing 7 packages: cli, vctrs, RColorBrewer, tibble, scales, ggplot2, sp
## Warning: packages 'RColorBrewer', 'tibble', 'ggplot2' are in use and will not be
## installed
## Installing packages into 'C:/Users/DELL/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'cli' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cli'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\DELL\Documents\R\win-library\4.1\00LOCK\cli\libs\x64\cli.dll to C:
## \Users\DELL\Documents\R\win-library\4.1\cli\libs\x64\cli.dll: Permission denied
## Warning: restored 'cli'
## package 'vctrs' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'vctrs'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\DELL\Documents\R\win-library\4.1\00LOCK\vctrs\libs\x64\vctrs.dll to C:
## \Users\DELL\Documents\R\win-library\4.1\vctrs\libs\x64\vctrs.dll: Permission
## denied
## Warning: restored 'vctrs'
## package 'scales' successfully unpacked and MD5 sums checked
## package 'sp' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'sp'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\DELL\Documents\R\win-library\4.1\00LOCK\sp\libs\x64\sp.dll to C:
## \Users\DELL\Documents\R\win-library\4.1\sp\libs\x64\sp.dll: Permission denied
## Warning: restored 'sp'
##
## The downloaded binary packages are in
## C:\Users\DELL\AppData\Local\Temp\RtmpaurNEv\downloaded_packages
## * checking for file 'C:\Users\DELL\AppData\Local\Temp\RtmpaurNEv\remotes2a2449137a8\rstudio-leaflet-0016c07/DESCRIPTION' ... OK
## * preparing 'leaflet':
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Removed empty directory 'leaflet/docs/libs'
## Removed empty directory 'leaflet/docs'
## Removed empty directory 'leaflet/man-roxygen'
## * building 'leaflet_2.1.1.tar.gz'
##
## Warning: package 'leaflet' is in use and will not be installed
LasvegasCoords = bar %>% filter(city == "Las Vegas")
center_lon = median(LasvegasCoords$longitude,na.rm = TRUE)
center_lat = median(LasvegasCoords$latitude,na.rm = TRUE)
map <- leaflet(rbind(fivestar,onestar)) %>%
addProviderTiles("Esri.NatGeoWorldMap") %>%
addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(review_count)) %>%
addCircleMarkers(data=fivestar,col="blue",group="fivestar") %>%
addCircleMarkers(data=onestar,color='yellow',group="onestar") %>%
#Layers control
addLayersControl(overlayGroups = c("fivestar","onestar"),
options = layersControlOptions(collapsed = FALSE)
) %>%
# controls
setView(lng=center_lon, lat=center_lat,zoom = 13)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
map
Top five stars bars are more centralized and concentrated; while the worst one star bars are relatively more sparse in their location, and are not close to transportation hub.